In [46]:
import pandas as pd
import urllib2
from bs4 import BeautifulSoup
import re
from sklearn.cluster import KMeans
import palettable as pal
from itertools import chain
import os
import os.path #redundant: os.path is already available after 'import os', kept here out of caution
import glob
import numpy as np
from IPython.core.debugger import Tracer #used this to step into the function and debug it, also need line with Tracer()()
import matplotlib.pyplot as plt
from collections import Counter
import cPickle as cpk
from stackedBarGraph import StackedBarGrapher
SBG = StackedBarGrapher()
%matplotlib inline
Also, will need to export the following from MATLAB: (1) RInumber, (2) CO number, (3) ion mode information
The KEGG CO numbers from the RI data are not unique. In MATLAB, I have created a new value 'RInumber', an arbitrary number assigned to each 'mzRT' feature. The data exported out of MATLAB include that number, the corresponding KEGG CO number, and whether the feature was observed in positive or negative ion mode. These data will be used to create a lookup table which allows use of the CO numbers or RInumbers as needed.
In [47]:
mtabFile = 'RImetabolites_isomers.2015.08.27.csv' #first column is RInumber
In [48]:
CO_fromMATLAB=pd.read_csv(mtabFile, index_col='RInumber')
# CO_fromMATLAB=CO_RawData[CO_RawData.sum(axis=1)!=0]
#read in the data from MATLAB and take a quick look
CO_fromMATLAB.head(n=5)
Out[48]:
Make the list of unique cNumbers here, query KEGG for the CO-to-KO links, and filter the list before splitting the dataframes into data and metadata...
In [49]:
#make a list of the unique CO numbers for the CreateHash_COtoKO.py. Export the list as CSV
td = CO_fromMATLAB.groupby('cNumber').count()
COnumbers = td.drop(list(td.columns.values),axis=1)
del td
COnumbers.to_csv('exportCOnumbers.csv',header=True)
In [50]:
def findRInumber(dataIn,KEGGin):
#find possible RI numbers for a given KEGG number.
dataOut = []
for i,KEGG in enumerate(dataIn['KEGG']):
if KEGG == KEGGin:
t = dataIn.index[i]
dataOut.append(t)
return dataOut
##For example: this will give back one row, C18028 will be multiple
#m = findRInumber(forRelatedness,'C00078')
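A hedged one-line equivalent of the loop above, using boolean selection on the index (assumes dataIn carries a 'KEGG' column, as forRelatedness does later on):
In [ ]:
##same lookup without the explicit loop; alternative sketch only
#dataOut = list(dataIn.index[dataIn['KEGG'] == KEGGin])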
In [51]:
def convertRItoCO(dataIn,RIin):
#do the reverse, given an RInumber find the cNumber
dataOut = dataIn.loc[RIin].loc['cNumber']
return dataOut
##This will always be a single value
#m = convertRItoCO(forRelatedness,'RI2')
In [52]:
#slight change, no need to send in a comparison file if it is always the same thing
def convertRItoCO2(RIin):
#do the reverse, given an RInumber find the cNumber
dataOut = CO_fromMATLAB.loc[RIin].loc['cNumber']
return dataOut
##This will always be a single value, also uses CO_fromMATLAB as input
This grabs the CO/KO links from the KEGG website. The actual code is in the CreateHash_COtoKO.py that Harriet wrote. Note that since the exportCOnumbers.csv file is a unique list of C numbers, we essentially already have a lookup table for all the metabolites of interest.
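The cells below assume that pickle holds a dictionary keyed by C number, where each entry carries (at least) 'Related KO' and 'Related CO' lists. A purely illustrative mock of that shape (hypothetical values, not real KEGG links):
In [ ]:
##hypothetical structure of WorkingFile, for reference only
#WorkingFile = {'C00001': {'Related CO': ['C00001'], 'Related KO': ['K00001', 'K00002']},
#               'C99999': {'Related CO': ['C99999'], 'Related KO': []}}  #a compound with no linked KO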
In [53]:
if os.path.isfile('exportCOnumbers.csv' + '.pickle'):
#just read in the file
WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
else:
#need to make the file
filename = "CreateHash_COtoKO.py"
%run $filename exportCOnumbers.csv
#then read in the file
WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
In [54]:
def SplitCODict(WorkingFile):
CO_withoutKO={}
CO_withKO={}
for CO in WorkingFile.keys():
if WorkingFile[CO]['Related KO']==[]:
CO_withoutKO[CO]=WorkingFile[CO]
else:
CO_withKO[CO]=WorkingFile[CO]
return CO_withoutKO, CO_withKO
CO_withoutKO, CO_withKO=SplitCODict(WorkingFile)
print 'There are', len(CO_withKO), 'COs with an associated KO.', len(CO_withoutKO), 'are not associated with a KO.'
In [55]:
AllKO=[]
AllCO=[]
for key in CO_withKO:
AllKO.append(CO_withKO[key]['Related KO'])
AllCO.append(CO_withKO[key]['Related CO'])
AllKO=list(set([item for sublist in AllKO for item in sublist]))
AllCO=list(set([item for sublist in AllCO for item in sublist]))
# KO_limited_Norm2Mean=KO_Norm2Mean.loc[AllKO].dropna()
# CO_limited_Norm2Mean=CO_Norm2Mean.loc[AllCO].dropna()
In [56]:
#go through CO_fromMATLAB one row at a time (inefficient for sure, but I understand
#what is happening), then make a new column in CO_fromMATLAB that is True/False
CO_fromMATLAB['inList'] = ""
for idx in range(0,len(CO_fromMATLAB)):
# for idx in range(0):
fc = CO_fromMATLAB.ix[idx,'cNumber']
if fc in AllCO:
CO_fromMATLAB.ix[idx,'inList'] = True
else:
CO_fromMATLAB.ix[idx,'inList'] = False
In [57]:
#can't quite figure out how to do this in one step (see the sketch after this cell).
m = CO_fromMATLAB[CO_fromMATLAB['inList']==True]
CO_metadata_pruned = m.loc[:,['cNumber','ChargedMass','RT','ionMode']]
#this list of days is useful, so define it up front. Actually want something that can't change
#but had trouble getting a tuple to work as an index.
dayList = ['S1','S2','S3','S4','S5'] #this makes a list (mutable, can be changed)
#days = ('S1','S2','S3','S4','S5') #this makes a tuple (immutable)
CO_RawData_pruned = m.loc[:,dayList]
del m
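The same filtering and split can be done in one pass with isin, without the row-by-row inList loop or the temporary frame m; a hedged alternative sketch (not run, the loop version above is kept):
In [ ]:
##one-step equivalent of the two cells above
#keep = CO_fromMATLAB[CO_fromMATLAB['cNumber'].isin(AllCO)]
#CO_metadata_pruned = keep[['cNumber','ChargedMass','RT','ionMode']]
#CO_RawData_pruned = keep[dayList]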
This is the new version, with the extended metadata. Parse the file into data and metadata.
In [58]:
#read in the KO data
KO_RawData=pd.read_csv('newData/AllPhytoKegg_KO_redone.tab', index_col='gID', delimiter='\t')
KO_RawData=KO_RawData[KO_RawData.sum(axis=1)!=0]
In [59]:
cmap=pal.colorbrewer.sequential.YlOrRd_5.get_mpl_colormap()
fig, axs=plt.subplots(2,2)
fig.set_size_inches(12,12)
for ax in axs:
for a in ax:
a.set_ylim([0,1000])
CO_RawData_pruned.plot(kind='hist', bins=100,colormap=cmap, ax=axs[0][0])
axs[0][0].set_title('Histogram of CO "concentrations"', size='large')
KO_RawData.plot(kind='hist', bins=100,colormap=cmap,ax=axs[0][1])
axs[0][1].set_title('Histogram of KO TPM', size='large')
CO_RawData_pruned.plot(kind='hist', bins=100,colormap=cmap, range = [0,0.1],ax=axs[1][0])
KO_RawData.plot(kind='hist', bins=100,colormap=cmap, range = [0,50],ax=axs[1][1])
Out[59]:
In [60]:
def NormalizeToMean(DF):
DF_meanNorm=DF.copy()
out=DF_meanNorm.copy()
DF_meanNorm['mean']=DF.mean(axis=1)
for i in out.columns:
out[i]=DF_meanNorm[i]/DF_meanNorm['mean']
DF_meanNorm=DF_meanNorm.T.drop('mean').T
return out
def NormalizeToMax(DF):
DF_meanNorm=DF.copy()
out=DF_meanNorm.copy()
DF_meanNorm['max']=DF.max(axis=1)
for i in out.columns:
out[i]=DF_meanNorm[i]/DF_meanNorm['max']
DF_meanNorm=DF_meanNorm.T.drop('max').T
return out
def NormalizeToMean_CV(DF):
out=DF.copy()
out['mean']=DF.mean(axis=1)
out['SD']=DF.std(axis=1)
out['CV']=out['SD']/out['mean']
return out
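A hedged pandas shorthand for the same row-wise normalizations (div with axis=0 divides each row by its own mean or max); kept as a sketch next to the explicit functions above:
In [ ]:
##equivalent one-liners, sketch only
#CO_Norm2Mean = CO_RawData_pruned.div(CO_RawData_pruned.mean(axis=1), axis=0)
#CO_Norm2Max = CO_RawData_pruned.div(CO_RawData_pruned.max(axis=1), axis=0)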
In [61]:
#several options for normalizing the data
CO_Norm2Mean=NormalizeToMean(CO_RawData_pruned) #this is what gets used in the original code
KO_Norm2Mean=NormalizeToMean(KO_RawData) #this is what gets used in the original code
CO_Norm2Max=NormalizeToMax(CO_RawData_pruned)
KO_Norm2Max=NormalizeToMax(KO_RawData)
cmap=pal.colorbrewer.sequential.YlOrRd_5.get_mpl_colormap()
fig, axs=plt.subplots(1,2)
fig.set_size_inches(12,6)
kplt=KO_Norm2Mean.plot(kind='hist', bins=100, title='KO Mean Normalized', colormap=cmap, ax=axs[1])
cplt=CO_Norm2Mean.plot(kind='hist', bins=100, title='CO Mean Normalized', colormap=cmap, ax=axs[0])
fig, axs=plt.subplots(1,2)
fig.set_size_inches(12,6)
kplt=KO_Norm2Max.plot(kind='hist', bins=100, title='KO Max Normalized', colormap=cmap, ax=axs[1])
cplt=CO_Norm2Max.plot(kind='hist', bins=100, title='CO Max Normalized', colormap=cmap, ax=axs[0])
In [62]:
#could also try the normalize to mean, CV
cmap=pal.colorbrewer.diverging.PRGn_5.get_mpl_colormap()
fig,ax=plt.subplots(1)
CO_CV=NormalizeToMean_CV(CO_RawData_pruned)
KO_CV=NormalizeToMean_CV(KO_RawData)
#KL note: by using KO_CV.CV, this will only plot the 'CV' variable in the data frame
KO_CV.CV.plot(kind='hist', ax=ax, bins=100, color='r')
CO_CV.CV.plot(kind='hist', ax=ax, bins=100, title='Coefficient of Variation', color='k')
fig.savefig('Coefficent of Variation')
In [63]:
#use _finalOption variable names to make life easier
KO_finalOption = KO_Norm2Mean.loc[AllKO].dropna()
CO_finalOption = CO_Norm2Mean.dropna() #already 'limited' this before the normalization
In [64]:
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
#this next line just prints the docstring of the interactive namespace; pre-canned text, not needed for the analysis
print(__doc__)
def kmeanCluster(data,nc):
#kmeans=KMeans(n_clusters=nc)
kmeans = KMeans(n_clusters = nc, max_iter = 1000, n_init = 50, init = 'random')
kmeans.fit(data)
newData=data.copy()
newData['kmeans']=kmeans.labels_
return newData
def PlotKmeans(KmeansPD, kSize=10, figSizeX=1, figSizeY=5, color='k'):
KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color=color)
fig,axs=plt.subplots(figSizeX, figSizeY)
axs=[item for sublist in axs for item in sublist]
fig.set_size_inches(9,12)
for ax, y in zip(axs,range(kSize)):
pltData=KmeansPD[KmeansPD.kmeans==y].T.drop('kmeans')
pltData.plot(ax=ax, legend=False, grid=False, color=color)
In [65]:
#run the Kmeans clustering on the KO data alone first and plot results
#koClust=kmeanCluster(KO_limited_Norm2Mean, 15)
#PlotKmeans(koClust,15,3,5, 'r')
koClust=kmeanCluster(KO_finalOption, 12)
PlotKmeans(koClust,6,2,3, 'r')
In [66]:
#plot up the results of one number of clusters for the CO data only
#coClust=kmeanCluster(CO_limited_Norm2Mean, 15)
#PlotKmeans(coClust,15,3,5, 'k')
coClust=kmeanCluster(CO_finalOption.loc[:,(dayList)], 12)
PlotKmeans(coClust,6,2,3, 'k')
From HA: By normalizing the data to the mean we can then (in theory) combine the two and cluster them together? KL 8/20/2015 note: this is essentially the CO and KO data concatenated into a single data frame. Note that the actual K-means clustering does not happen until after the silhouette analysis (because the number of clusters has to be set first); the silhouette analysis is what is used to choose that number.
In [67]:
##First, combine the CO and the KO data
#Combined_KO_CO_MeanNorm=KO_limited_Norm2Mean.append(CO_limited_Norm2Mean)
Combined_KO_CO_final=KO_finalOption.append(CO_finalOption.loc[:,(dayList)])
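As a quick cross-check on the combined table, a minimal sketch of scanning a few candidate cluster counts with the sklearn silhouette_score imported earlier; this is a simplified stand-in for, not a replacement of, the Monte Carlo procedure run below:
In [ ]:
#hedged sketch: average silhouette for a handful of candidate k values
for nc in range(2, 8):
    labels = KMeans(n_clusters=nc, n_init=10).fit_predict(Combined_KO_CO_final)
    print nc, silhouette_score(Combined_KO_CO_final, labels)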
In [68]:
filename = 'nClustersRequired.py'
%run $filename
In [69]:
#cheat for the moment...save the data I have as a CSV file and then read it back in.
#figure out a cleaner format later.
dataFilename = 'NB_combined_for_kmeans.csv'
Combined_KO_CO_final.to_csv(dataFilename)
data = Data(dataFilename)
#pattern_labels are the row labels...for us these are the RInumbers and KO numbers
pattern_labels = []
patterns_data, pattern_labels = data.load_data()
In [70]:
def forest_run(dimensions, patterns, pattern_labels, metric='qe', fNamePrefix = '', k_up=20, k_down=2, simulations=55, iterations=50):
"""
A method for watching Forest Gump run
:param dimensions: the dimensionality of the data
:param patterns: the data itself
:param metric: the quality metric ('qe', 'si', or 'db')
:param fNamePrefix: optional text to prepend to the results file written at the end
:param k_up: the maximum number of clusters
:param k_down: the minimum number of clusters
:param simulations: the number of simulations for each k
:param iterations: the number of iterations for each k-means pass
"""
# variable to store the best result
best_clustering = None
# the quality of that result
best_quality = 1000.00
# write results out to file while simulating
file_out = fNamePrefix + 'MonteCarloFinalResults' + '_' + metric + '.csv'
#with open(file_out, 'w', newline='') as f: #the newline keyword is Python 3 only, so it doesn't work here
with open(file_out,'w') as f:
# different k values to test on
for i in range(k_down, k_up):
num_clusters = i
# number of retries / simulations
print('working on ' , i, '# of kmeans groups') #in Python 2 this prints a tuple, hence the odd-looking output
for j in range(simulations):
# create a clustering solution and apply k-means
clustering = Clustering(dimensions, num_clusters, patterns, 0.0001)
clustering.k_means_clustering(iterations)
# used to compute quality of the solution
quality = ClusteringQuality(clustering, 0.0001)
this_quality = 0.0
if metric == 'qe':
this_quality = quality.quantization_error()
if metric == 'si':
this_quality = quality.average_silhouette_index()
if metric == 'db':
this_quality = quality.davies_bouldin()
# update the best clustering
if this_quality < best_quality:
best_quality = this_quality
best_clustering = clustering
#print("Updated best clustering") #comment out, clogging up display
# write result to the file
result = [num_clusters, this_quality]
for x in result:
f.write(str(x))
f.write(",")
f.write("\n")
f.flush()
#print(j, result) #comment out, clogging up display
# print the actual clustering out to console...comment this out, too much information
#best_clustering.print_solution(pattern_labels)
In [71]:
dimensions = 5 #this is a function of the data itself. In the NB data we have five sampling days.
setSimulations = 100
setIterations = 100 #this is the default from the Turing Finance code
setKup = 20
setKdown = 2
In [72]:
#for now, set the if to False for the forest_run in the next three cells...those runs are time consuming
In [73]:
prefix = 'KO_norm2mean_'
if False:
forest_run(dimensions, patterns_data, pattern_labels, metric='db', fNamePrefix = prefix,
simulations=setSimulations, k_down=setKdown, k_up=setKup, iterations = setIterations)
#read in the results
riScores_db=pd.read_csv((prefix + 'MonteCarloFinalResults_db.csv'),header=None,delimiter=',',
index_col=False,names=['nClusters', 'score'])
#optimal cluster solution has the smallest Davies-Bouldin index
In [74]:
grouped = riScores_db.groupby('nClusters')
means = grouped.mean().unstack()
errors = grouped.std().unstack()
fig, ax = plt.subplots()
plt.plot(range(setKdown,setKup),means,marker = 'o',color = '#1b9e77')
plt.errorbar(range(setKdown,setKup),means,errors,marker = 'o',color = '#1b9e77')
plt.title('Kmeans, Davies-Bouldin')
ax.set_xlabel('nClusters')
plt.show()
In [75]:
plt.gcf().canvas.get_supported_filetypes()
Out[75]:
In [76]:
#need this to make a file where the text is actually editable (as opposed to outlines)
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
fig.savefig('DaviesBouldinV2.pdf')
In [77]:
prefix = 'KO_norm2mean_'
if False:
forest_run(dimensions, patterns_data, pattern_labels, metric='qe', fNamePrefix = prefix,
simulations=setSimulations, k_down=setKdown, k_up=setKup, iterations = setIterations)
#now read in the results
riScores_qe=pd.read_csv((prefix + 'MonteCarloFinalResults_qe.csv'),header=None,delimiter=',',
index_col=False,names=['nClusters', 'score'])
In [78]:
#goal is to minimize quantization error. QE is the distance between a sample
#and its representation. Lower quantization errors represent a good data cluster.
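To make the metric concrete, a hedged illustration of quantization error using a plain sklearn KMeans fit on the combined table (the Monte Carlo code above uses its own Clustering/ClusteringQuality classes, so this is only a cross-check): here it is computed as the mean distance from each sample to its assigned centroid.
In [ ]:
#hedged illustration of quantization error at k=6
km = KMeans(n_clusters=6, n_init=10).fit(Combined_KO_CO_final)
X = Combined_KO_CO_final.values
qe = np.linalg.norm(X - km.cluster_centers_[km.labels_], axis=1).mean()
print 'quantization error (k=6):', qe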
In [79]:
grouped = riScores_qe.groupby('nClusters')
means = grouped.mean().unstack()
errors = grouped.std().unstack()
fig, ax = plt.subplots()
plt.plot(range(setKdown,setKup),means,marker = 'o',color = '#1b9e77')
plt.errorbar(range(setKdown,setKup),means,errors,marker = 'o',color = '#1b9e77')
plt.title('Kmeans, Quantization error')
ax.set_xlabel('nClusters')
plt.show()
In [ ]:
##silhouette is quite slow compared to the other two metrics
prefix = 'KO_norm2mean_'
if False:
forest_run(dimensions, patterns_data, pattern_labels, metric='si', fNamePrefix = prefix,
simulations=setSimulations, k_down=setKdown, k_up=setKup, iterations = setIterations)
riScores_si=pd.read_csv((prefix + 'MonteCarloFinalResults_si.csv'),header=None,delimiter=',',
index_col=False,names=['nClusters', 'score'])
##note, want to maximize the silhouette value for each pattern in the dataset
In [ ]:
grouped = riScores_si.groupby('nClusters')
means = grouped.mean().unstack()
errors = grouped.std().unstack()
fig, ax = plt.subplots()
plt.plot(range(setKdown,setKup),means,marker = 'o',color = '#1b9e77')
plt.errorbar(range(setKdown,setKup),means,errors,marker = 'o',color = '#1b9e77')
plt.title('Kmeans, silhouette index')
ax.set_xlabel('nClusters')
plt.show()
#remember, want to maximize this value
In [80]:
#setting # of clusters manually, also some good options with lower # of clusters I think
#this number will get used later when plotting up the BRITE categories and the Kmeans clusters
makeNclusters = 6
In [81]:
#do the K-means clustering with the final # of clusters
CcoClust=kmeanCluster(Combined_KO_CO_final, makeNclusters) #was 18
#this will result in a data frame with the kmeans cluster as an added column. Remember
#this will have RI numbers for the compounds
In [82]:
def PlotKmeansCombined(KmeansPD, kSize=10, figSizeX=1, figSizeY=5, color='k'):
KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color='k',range = (0,kSize),align = 'left')
fig,axs=plt.subplots(figSizeX, figSizeY)
axs=[item for sublist in axs for item in sublist]
fig.set_size_inches(15,9)
i=KmeansPD.index
i=list(i)
Ks=re.compile('K.*')
#Cs=re.compile('C.*')
Cs = re.compile('R.*') #this is the RInumber I created...for the moment, do not need the Cnumber
C = filter(Cs.search, i)
K = filter(Ks.search, i)
Ksplit=KmeansPD.loc[K]
Csplit=KmeansPD.loc[C]
for ax, y in zip(axs,range(kSize)):
KData=Ksplit[Ksplit.kmeans==y].T.drop('kmeans')
KData.plot(ax=ax, legend=False, grid=False, color='b')
CData=Csplit[Csplit.kmeans==y].T.drop('kmeans')
CData.plot(ax=ax, legend=False, grid=False, color='r')
SumKC=len(KData.T)+len(CData.T)
KPct=(len(KData.T))
CPct=(len(CData.T))
ax.set_title('nGenes ' + str(KPct) + ', nCpds ' + str(CPct) + ', Cluster ' + str(y))
ax.set_ylim([0,5])
mpl.rcParams['pdf.fonttype'] = 42
fig.savefig('CombinedKOandCO_Kmeans.pdf')
PlotKmeansCombined(CcoClust,makeNclusters,2,3, 'r')
In [83]:
#But...for the CheckRelatedness...do need to go back to the cNumber...
#for now, easiest to just make yet another matrix and put the cNumbers back in.
forRelatedness = CcoClust.copy(deep=True) #make a copy of the CcoClust data frame
forRelatedness.insert(0,'KEGG',"") #add a column called 'KEGG'
forRelatedness.head(5)
Out[83]:
In [84]:
for idx in range(0,len(forRelatedness)):
t = forRelatedness.iloc[idx,:].name
if t[0]=='R':
#go find the matching cNumber in CO_RawData_all
t2 = CO_fromMATLAB.loc[t,('cNumber')]
forRelatedness.ix[idx,('KEGG')] = t2
elif t[0] == 'K':
#just copy the K number over
forRelatedness.ix[idx,('KEGG')] = t
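A hedged one-pass alternative to the loop above, building the KEGG column straight from the index (RInumbers are mapped through CO_fromMATLAB, K numbers are copied through); sketch only, the loop version is kept:
In [ ]:
##equivalent list-comprehension version
#forRelatedness['KEGG'] = [CO_fromMATLAB.loc[t, 'cNumber'] if t.startswith('R') else t
#                          for t in forRelatedness.index]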
In [85]:
def CheckRelatedness(inClust,nC):
df=pd.DataFrame(columns=['Common Rxn','No linked Rxn'], index=range(nC))
for n in range(nC):
kClust=inClust[inClust.kmeans==n]
#i=kClust.index
i = kClust.KEGG #use the new KEGG column I created, which holds the Cnumbers and Knumbers
i=list(i)
#note...re is one of the things imported at the very beginning
Csearc=re.compile('C.*') #re is regular expression...perl-like; re.compile basically makes a pattern object
Cs = filter(Csearc.search, i)
Ksearc=re.compile('K.*')
Kis = filter(Ksearc.search, i)
Kis=set(Kis)
Ks=[]
for c in Cs:
if c in CO_withKO.keys():
Ks.append(CO_withKO[c]['Related KO'])
Ks=set([item for sublist in Ks for item in sublist])
df.loc[n,'Common Rxn']=len(Kis.intersection(Ks))
df.loc[n, 'No linked Rxn']=len(Kis)-len(Kis.intersection(Ks))
df.plot(kind='bar', stacked=True, colormap=pal.colorbrewer.diverging.PRGn_5.get_mpl_colormap(), grid=False)
CheckRelatedness(forRelatedness, makeNclusters)
Pick up here: change to using the Biopython module to get maps with merged KO and CO data within a K-means group (12 November 2015).
In [86]:
from Bio import SeqIO
from Bio.KEGG.REST import *
from Bio.KEGG.KGML import KGML_parser
from Bio.Graphics.KGML_vis import KGMLCanvas
from Bio.Graphics.ColorSpiral import ColorSpiral
from IPython.display import Image, HTML
# import random #seems like I can probably skip this, but just comment out in case that is not true
# A bit of code that will help us display the PDF output
def PDF(filename):
return HTML('<iframe src=%s width=700 height=350></iframe>' % filename)
# A bit of helper code to shorten long text
def head(text, lines=10):
""" Print the first lines lines of the passed text.
"""
print '\n'.join(text.split('\n')[:lines] + ['[...]'])
In [87]:
#set up a function to get the list of K orthologues for a given pathway (must be defined as ko00140 NOT map00140)
def getKfrom_ko(ko_id):
pathway_file = kegg_get(ko_id).read() # query and read the pathway
K_list = []
current_section = None
for line in pathway_file.rstrip().split("\n"):
section = line[:12].strip() # section names are within 12 columns
if not section == "":
current_section = section
if current_section == "ORTHOLOGY":
K_identifiers = line[12:].split("; ")
t = K_identifiers[0]
K_id = t[0:6]
if not K_id in K_list:
K_list.append(K_id)
return K_list
In [88]:
#set up a function to get the list of compounds for a given pathway (must be defined as ko00140 NOT map00140)
def getCfrom_ko(ko_id):
pathway_file = kegg_get(ko_id).read() # query and read the pathway
compound_list = []
current_section = None
for line in pathway_file.rstrip().split("\n"):
section = line[:12].strip() # section names are within 12 columns
if not section == "":
current_section = section
if current_section == "COMPOUND":
compound_identifiers = line[12:].split("; ")
t = compound_identifiers[0]
compound_id = t[0:6]
if not compound_id in compound_list:
compound_list.append(compound_id)
return compound_list
In [89]:
allPathways = kegg_list("pathway").read()
len(allPathways.split('\n'))
Out[89]:
In [90]:
#so, 481 pathways at KEGG, not all of which are likely to be interesting.
#up to 483 on 12/16/2015
#and up to 485 by 2/9/2016
In [91]:
trimPath = []
current_section = None
for line in allPathways.rstrip().split("\n"):
# Tracer()()
tp = line[8:13]
trimPath.append('ko' + tp)
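For reference, each line returned by kegg_list('pathway') is assumed to start with the eight-character 'path:map' prefix, so line[8:13] picks out the five-digit map number, which then gets re-prefixed with 'ko'; an illustrative mock (not fetched from KEGG):
In [ ]:
##mock line in the assumed format, for illustration only
#example_line = 'path:map00010\tGlycolysis / Gluconeogenesis'
#example_line[8:13]  #-> '00010'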
In [92]:
#have some cases where KEGG will send back a pathway, but the pathway itself is not searchable...seems to
#be a KEGG bug, 'ko00351' was first, then realized there are many of these,
#did this list manually since I thought it would be short...
toDelete = ('ko00351','ko01010','ko01060', 'ko01061', 'ko01062', 'ko01063', 'ko01064', 'ko01065', 'ko01066',
'ko01070', 'ko07011', 'ko07012', 'ko07013', 'ko07014', 'ko07015', 'ko07016', 'ko07017', 'ko07018', 'ko07019',
'ko07020', 'ko07021', 'ko07023', 'ko07024', 'ko07025', 'ko07026', 'ko07027', 'ko07028', 'ko07029', 'ko07030',
'ko07031', 'ko07032', 'ko07033', 'ko07034', 'ko07035', 'ko07036', 'ko07037', 'ko07038', 'ko07039', 'ko07040',
'ko07041', 'ko07042', 'ko07043', 'ko07044', 'ko07045', 'ko07046', 'ko07047', 'ko07048', 'ko07049', 'ko07050',
'ko07051', 'ko07052', 'ko07053', 'ko07054', 'ko07055', 'ko07056', 'ko07057', 'ko07110', 'ko07112', 'ko07114',
'ko07117', 'ko07211', 'ko07212', 'ko07213', 'ko07214', 'ko07215', 'ko07216', 'ko07217', 'ko07218', 'ko07219',
'ko07220', 'ko07221', 'ko07222', 'ko07223', 'ko07224', 'ko07225', 'ko07226', 'ko07227', 'ko07228', 'ko07229',
'ko07230', 'ko07231', 'ko07232', 'ko07233', 'ko07234', 'ko07235','ko04933')
#probably a way to do this without the for loop (see the sketch after this cell), but this will work
for item in toDelete:
trimPath.remove(item)
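Per the note above, a hedged loop-free equivalent that drops the problem pathways in one pass while keeping the original order:
In [ ]:
##alternative to the remove() loop (same result)
#dropSet = set(toDelete)
#trimPath = [p for p in trimPath if p not in dropSet]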
In [93]:
colLabel = ['nCpds','nGenes'] #starting with this is easiest - makes one list, no need to flatten
for item in range(makeNclusters):
colLabel.append('Km' + str(item) + '_cpd')
colLabel.append('Km' + str(item) + '_gene')
gatherCounts = pd.DataFrame(0, index = trimPath, columns = colLabel)
In [94]:
# #useColors = pal.colorbrewer.diverging.PiYG_4.hex_colors
# useColors = pal.colorbrewer.qualitative.Set3_4_r.hex_colors
# # useColors = pal.colorbrewer.qualitative.Accent_4_r.hex_colors
# # useColors = pal.colorbrewer.qualitative.Dark2_4.hex_colors
# # cmap=pal.colorbrewer.sequential.YlOrRd_5.get_mpl_colormap()
#manually set the colors
#useColors = ('#e41a1c','#377eb8','#f7f7f7','#4daf4a','#984ea3') #[red,blue,white,green,purple]
useColors = pal.colorbrewer.qualitative.Set3_4.hex_colors
useColors.insert(2,'#f7f7f7')
In [ ]:
#messing around with colors...change back to red/blue/white/green/purple for now
useColors = pal.colorbrewer.qualitative.Set1_4.hex_colors
useColors.insert(2,'#f7f7f7') #insert the white
#this set of loops works on one pathway at a time, only plotting if I get at least one case of a reaction
#containing both a gene and a metabolite within the pathway of interest
if True: #can be time consuming (set to False when editing)
#setup the strings to match first
rnString = re.compile('(?:[rn:R])(\d+)$') #will return R00190
cpdString = re.compile('(?:[cpd:C])(\d+)$') #will return C00190
size = 20 #turns out I can increase the size of the compounds in the plots
#for kn in range(makeNclusters):
#for kn in range(3,6):
for kn in range(1):
fullSet = set(forRelatedness.KEGG)
oneK = forRelatedness[forRelatedness.kmeans == kn] #get gene & transcript information for one Kmeans group
getKm = 'Km' + str(kn)
#check if the directories exist
directoryPDF = 'plots_6Kmeans' + str(kn) + '/PDFfiles'
if not os.path.exists(directoryPDF):
os.makedirs(directoryPDF)
#check if the directories exist
directoryPNG = 'plots_6Kmeans' + str(kn) + '/PNGfiles'
if not os.path.exists(directoryPNG):
os.makedirs(directoryPNG)
for item in trimPath: #searching within one pathway at a time
#for item in shortList: #short list for testing
#print item
plotPathway = [] #gather up yes/no and will only plot if have linked genes/mtabs
genes = getKfrom_ko(item)
compounds = getCfrom_ko(item)
gatherCounts.loc[item,'nCpds'] = len(compounds)
gatherCounts.loc[item,'nGenes'] = len(genes)
#have to track genes and compounds differently for the biopython plotting later on
setG = set(genes)
setC = set(compounds)
setB = set(oneK.KEGG)
intGenes = setG.intersection(setB)
intCompounds = setC.intersection(setB)
gatherCounts.loc[item,(getKm + '_gene')] = len(intGenes)
gatherCounts.loc[item,(getKm + '_cpd')] = len(intCompounds)
#now, before plotting, look for intersection of genes/compounds using the reaction information
current_section = None
for gen in intGenes: #go through each gene...one at a time
countCpd = []
countGene = []
rnList = kegg_link('reaction',gen).read() #get the list of reactions for that gene
#sometimes there is a KEGG issue and this crashes out...usually restarting helps
for line in rnList.rstrip().split('\n'):
m = rnString.search(line) #get the reaction number
cpdList = kegg_link('cpd',m.group(0)).read() #now go get the compounds for that reaction
#can have no compounds in a reaction (only glycans, begin with G, nothing I have matched)
if len(cpdList) > 1: #will be true if cpdList includes compounds
for line2 in cpdList.rstrip().split('\n'):
m2 = cpdString.search(line2).group(0)
#now that I have a compound, check if it is in intCompounds
if m2 in intCompounds:
countCpd.append(m2)
countGene.append(gen)
plotPathway.append('yes')
##First, plot the PNG files (one for each reaction within a pathway)
if len(countCpd) > 0:
kData = pd.DataFrame(columns = dayList)
for k in set(countGene):
kData = kData.append(oneK.ix[k,dayList])
cData = pd.DataFrame(columns = dayList)
for co in set(countCpd):
#convert CO to RI, can have multiple options
j = findRInumber(oneK,co)
cData = cData.append(oneK.loc[j,dayList])
fig,ax = plt.subplots(1)
cData.T.plot(color = 'k',ax=ax)
kData.T.plot(color = 'r',ax=ax)
handles, labels = ax.get_legend_handles_labels()
#convert the RI numbers to COnumbers for the figure
for ia, a in enumerate(labels):
#add compound/gene name to the legend
##kegg_list('C00020').read()
#will have annoying tabs, use this to find them
if a[0]== 'R':
tLabel = convertRItoCO(CO_fromMATLAB,a)
fn = kegg_list(tLabel).read()
labels[ia] = fn
elif a[0] == 'K':
fn = kegg_list(a).read()
labels[ia] = fn
ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
pngName = 'pathway' + item + '_' + m.group(0) + '.png'
fig.savefig(directoryPNG + '/' + pngName, bbox_inches = 'tight')
plt.close()
pngName = None #empty it in case that is where I am having issues
if len(plotPathway) > 0: # not empty
## plot the pathway map for this pathway, get details from KEGG for plotting
pathway = KGML_parser.read(kegg_get(item, "kgml"))
for element in pathway.orthologs:
#print element.name
for graphic in element.graphics:
tg = element.name[3:9] #skip over the 'ko:'
if (tg in intGenes):
#in the pathway AND in the set for this particular K means group
graphic.bgcolor = useColors[0] #red
elif (tg in fullSet) and (tg in genes) and (tg not in intGenes):
#in the pathway AND in the set of genes from RI, but *not* in this Kmeans group
graphic.bgcolor = useColors[1] #blue
elif (tg not in fullSet) and (tg in genes):
#in the pathway, but *not* in anything from the RI samples
graphic.bgcolor = useColors[2] #white
# Change the colours of compounds
for element in pathway.compounds:
for graphic in element.graphics:
tc = element.name[4:10] #skip over the 'cpd:'
if (tc in intCompounds):
#in the pathway AND in the set for this particular K means group
graphic.bgcolor = useColors[3] #green
graphic.width = size
graphic.height = size
elif (tc in fullSet) and (tc in compounds) and (tc not in intCompounds):
#in the pathway AND in the set of compounds from RI, but *not* in this Kmeans group
graphic.bgcolor = useColors[4] #purple
graphic.width = size
graphic.height = size
elif (tc not in fullSet) and (tc in compounds):
#in the pathway, but *not* in anything from the RI samples
graphic.bgcolor = useColors[2] #white
canvas = KGMLCanvas(pathway, import_imagemap=True)
pdfName = 'mapWithColors_' + str(item) + '.pdf'
canvas.draw(directoryPDF + '/' + pdfName)
#PDF(directoryPDF + '/' + pdfName) #uncomment this to view the PDF within the iPython notebook
In [ ]:
#want to export gatherCounts, with the added pathway name as a new column
gatherCounts['pathwayInfo'] = ''
gatherCounts['pathwayGroup_A'] = ''
gatherCounts['pathwayGroup_B'] = ''
In [ ]:
#organize pathways into the groups defined in the BRITE file (didn't work well for compounds,
#but the pathway groups seem useful)
def ReadBRITEfile(briteFile):
forBrite = pd.DataFrame(columns = ['map','A','B','C','wholeThing'])
# set up the expressions to match each level in the BRITE hierarchy
textA = re.compile(r'(^A<b>)(.+)(</b>)\s*(.*)$')
textB = re.compile(r'(^B)\s*(.*)$')
textC = re.compile(r'(\d+)\s*(.*)$')
#this relies on the fact that the rows are in order: A, with B subheadings, then C subheadings
setA = []
idxA = []
setB = []
setC = []
with open(briteFile) as f:
for idx,line in enumerate(f):
if line[0] != '#': #skip over the comments ('is not' checks identity, so != is safer here)
mA = textA.search(line)
mB = textB.search(line)
mC = textC.search(line)
if mA:
setA = mA.group(2)
#house cleaning (probably c)
idxA = idx
forBrite.loc[idx,'A'] = setA
forBrite.loc[idx,'wholeThing'] = line #using this as a double check for now
#forBrite.loc[idx,'map'] = mC.group(1)
elif mB:
setB = mB.group(2)
forBrite.loc[idx,'A'] = setA
forBrite.loc[idx,'B'] = setB
forBrite.loc[idx,'wholeThing'] = line
#forBrite.loc[idx,'map'] = mC.group(1)
elif mC:
#Tracer()()
setC = mC.group(2)
forBrite.loc[idx,'A'] = setA
forBrite.loc[idx,'B'] = setB
forBrite.loc[idx,'C'] = setC
forBrite.loc[idx,'wholeThing'] = line
forBrite.loc[idx,'map'] = mC.group(1)
return forBrite
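For reference, a hedged mock of the three line shapes the textA/textB/textC regexes above are written against (illustrative only, not copied from the real br08901.keg file):
In [ ]:
##hypothetical BRITE lines matching the A / B / C levels ReadBRITEfile expects
#A<b>Metabolism</b>
#B  Carbohydrate metabolism
#C    00010 Glycolysis / Gluconeogenesis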
In [ ]:
D = glob.glob('br08901.keg') #from http://www.genome.jp/kegg-bin/get_htext?br08901.keg; 12/15/2015
allBRITE=[]
for idx,nof in enumerate(D):
allBRITE = ReadBRITEfile(nof)
allBRITE.loc[allBRITE['map']=='01100']
In [ ]:
#KEGG seems to be updating pathways...finding some where gatherCounts has a pathway,
#but it is missing from the BRITE file
#ko04139, added to KEGG 12/1/15, pathway is: 'regulation of mitophagy'
#ko04211, added to KEGG 12/14/15, pathway is 'longevity regulating pathway mammal'
#let's delete them both from gatherCounts
gatherCounts = gatherCounts.drop(['ko04139'])
gatherCounts = gatherCounts.drop(['ko04211'])
#note that this cell will only work once
In [ ]:
#put the pathway name and group into the data frame before exporting it
for item in gatherCounts.index:
pathstr = kegg_list(item).read()
#this next line splits the string at the '\t', then keeps the piece at index = 1, and strips off the '\n'
gatherCounts.loc[item,('pathwayInfo')] = pathstr.split('\t')[1].rstrip()
t = allBRITE.loc[allBRITE['map']==item[2:]]
gatherCounts.set_value(item,'pathwayGroup_A',t['A'].values[0])
gatherCounts.set_value(item,'pathwayGroup_B',t['B'].values[0])
del t
In [ ]:
if True:
#now export the result as CSV file
gatherCounts.to_csv('pathways_with_Kmeans_KOnorm2Mean.2015.12.15.csv', header = True)
In [ ]:
#now...save all that so I don't have to redo it every time BUT be careful with re-assigning K means group numbers!
cpk.dump(gatherCounts, open('gatherCounts_norm2mean.pickle', 'wb'))
cpk.load(open('gatherCounts_norm2mean.pickle','rb'))
gatherCounts.head(2)
In [ ]:
colLabel
In [ ]:
newCols = colLabel[2:]
In [ ]:
cpdCols = colLabel[2::2]
cpdCols
In [ ]:
geneCols = colLabel[3::2]
geneCols
In [ ]:
#only keep the ones where I have some value...no sense in tracking zeros
s = gatherCounts[(gatherCounts.loc[:,newCols].values > 0).any(axis=1)]
pGroup = pd.unique(s.pathwayGroup_A.ravel())
In [ ]:
dfHighest = pd.DataFrame(index = pGroup,columns = newCols)
#do the math - add up the genes/cpds by higher level grouping
for i, group in s.groupby('pathwayGroup_A'):
d2 = group.loc[:,newCols]
out = d2.sum(axis=0)
dfHighest.loc[i,newCols] = out
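A hedged groupby one-liner that produces the same sums in one call (row order may differ from the pGroup order used above); sketch only:
In [ ]:
##equivalent aggregation
#dfHighest = s.groupby('pathwayGroup_A')[newCols].sum()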
In [ ]:
# playing around with color palettes
useColors = pal.colorbrewer.qualitative.Paired_12.hex_colors
dfHighest.plot(kind = 'bar',color=useColors)
In [ ]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
toPlot = dfHighest.loc[:,cpdCols]
toPlot.plot(kind = 'bar',color = useColors)
In [ ]:
##let's narrow in on the metabolism group since that is the only one that is really interesting
plotMetabolism = gatherCounts[gatherCounts.loc[:,'pathwayGroup_A']=='Metabolism']
In [ ]:
s = plotMetabolism[(plotMetabolism.loc[:,newCols].values > 0).any(axis=1)]
dataToPlot = s.loc[:,newCols]
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = newCols)
for i, group in s.groupby('pathwayGroup_B'):
d2 = group.loc[:,newCols]
out = d2.sum(axis=0)
newDFmtab.loc[i,newCols] = out
In [ ]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
toPlot_cpds = newDFmtab.loc[:,cpdCols]
toPlot_cpds.plot(kind = 'barh',color=useColors)
toPlot_cpds.to_csv('compounds_byKmeans.csv')
In [ ]:
useColors = pal.colorbrewer.qualitative.Paired_12.hex_colors
toPlot_cpds = newDFmtab.loc[:,cpdCols]
toPlot_cpds.T.plot(kind = 'barh',color=useColors,figsize = (12,12))
plt.legend(bbox_to_anchor=([-0.15, 0.5]))
In [ ]:
s = plotMetabolism[(plotMetabolism.loc[:,newCols].values > 0).any(axis=1)]
pGroup = pd.unique(plotMetabolism.pathwayInfo.ravel())
newDFmtabLower = pd.DataFrame(index = pGroup,columns = newCols)
for i, group in s.groupby('pathwayInfo'):
d2 = group.loc[:,newCols]
out = d2.sum(axis=0)
newDFmtabLower.loc[i,newCols] = out
In [ ]:
testing = toPlot_cpds.loc[:,'Km2_cpd']
testing.plot(kind = 'barh',color = 'blue')
plt.legend(bbox_to_anchor=([-1, 0.5]))
In [ ]:
toPlot_cpds_proportion=toPlot_cpds.copy()
toPlot_cpds['sum']=toPlot_cpds.sum(axis=1)
for i in toPlot_cpds_proportion.columns:
toPlot_cpds_proportion[i]=toPlot_cpds[i]/toPlot_cpds['sum']
toPlot_cpds=toPlot_cpds.T.drop('sum').T
In [ ]:
#what about the genes?
toPlot_genes = newDFmtab.loc[:,geneCols]
toPlot_genes_proportion=toPlot_genes.copy()
toPlot_genes['sum']=toPlot_genes.sum(axis=1)
for i in toPlot_genes_proportion.columns:
toPlot_genes_proportion[i]=toPlot_genes[i]/toPlot_genes['sum']
toPlot_genes=toPlot_genes.T.drop('sum').T
toPlot_genes.to_csv('genes_byKmeans.csv')
In [ ]:
#can play around with the colors
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
toPlot_cpds_proportion.plot(kind = 'barh',stacked=True,color=useColors)
plt.legend(bbox_to_anchor=([-1, 0.5]))
plt.title('proportions')
current_figure = plt.gcf()
mpl.rcParams['pdf.fonttype'] = 42
current_figure.savefig('cpd_proportions.pdf')
In [ ]:
toPlot_genes_proportion.plot(kind = 'barh',stacked=True,color=useColors)
plt.legend(bbox_to_anchor=([-1, 0.5]))
plt.title('proportions')
current_figure = plt.gcf()
mpl.rcParams['pdf.fonttype'] = 42
current_figure.savefig('genes_proportions.pdf')
In [ ]:
working = toPlot_genes.T
workingC = toPlot_cpds.T
In [ ]:
working['sum'] = toPlot_genes.T.sum(axis = 1)
workingC['sum'] = toPlot_cpds.T.sum(axis=1)
In [ ]:
for i in workingC.columns:
workingC[i] = workingC[i]/workingC['sum']
workingC = workingC.T.drop('sum').T
In [ ]:
for i in working.columns:
working[i] = working[i]/working['sum']
working = working.T.drop('sum').T
In [ ]:
useColors = pal.colorbrewer.qualitative.Paired_12.hex_colors
working.plot(kind = 'barh',stacked=True,color = useColors,figsize=(12,12))
plt.legend(bbox_to_anchor=([-1, 0.5]))
In [ ]:
useColors = pal.colorbrewer.qualitative.Paired_12.hex_colors
workingC.plot(kind = 'barh',stacked=True,color = useColors,figsize=(12,12),xlim=(0,1))
plt.legend(bbox_to_anchor=([-1, 0.5]))
In [ ]:
#exported this table to a CSV file for the paper (did one for compounds too)
toPlot_genes.to_csv('genes.csv')
toPlot_cpds.to_csv('cpds.csv')
In [ ]:
# plot one compound or gene (for paper)
oneCO = 'C02666'
plotOne = forRelatedness[forRelatedness['KEGG']==oneCO]
kData = plotOne.ix[:,dayList]
fig,ax = plt.subplots(1)
kData.T.plot(color = 'r',ax=ax, ylim = [0,5])
handles, labels = ax.get_legend_handles_labels()
#convert the RI numbers to COnumbers for the figure
for ia, a in enumerate(labels):
#add compound/gene name to the legend
##kegg_list('C00020').read()
#will have annoying tabs, use this to find them
if a[0]== 'R':
tLabel = convertRItoCO(CO_fromMATLAB,a)
fn = kegg_list(tLabel).read()
labels[ia] = fn
elif a[0] == 'K':
fn = kegg_list(a).read()
labels[ia] = fn
ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
pngName = 'pathway' + item + '_' + m.group(0) + '_' + oneCO + '.png'
# fig.savefig(pngName)
In [ ]:
shortList = ['C00020','C00864','K13799']
In [ ]:
shortList
In [ ]:
#note change to get values from list with multiple items
plotOne = forRelatedness.loc[forRelatedness['KEGG'].isin(shortList)]
In [ ]:
plotOne
In [ ]:
kData = plotOne.ix[:,dayList]
fig,ax = plt.subplots(1)
# kData.T.plot(color = 'r',ax=ax, ylim = [0,5])
kData.T.plot(ax=ax, ylim = [0,5])
handles, labels = ax.get_legend_handles_labels()
#convert the RI numbers to COnumbers for the figure
for ia, a in enumerate(labels):
#add compound/gene name to the legend
##kegg_list('C00020').read()
#will have annoying tabs, use this to find them
if a[0]== 'R':
tLabel = convertRItoCO(CO_fromMATLAB,a)
fn = kegg_list(tLabel).read()
labels[ia] = fn
elif a[0] == 'K':
fn = kegg_list(a).read()
labels[ia] = fn
# ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
pngName = 'pathway' + item + '_' + m.group(0) + '_' + oneCO + '.png'
fig.savefig(pngName)
In [ ]:
mpl.rcParams['pdf.fonttype'] = 42
fig.savefig('pantothenate.pdf')
In [ ]:
shortList = ['C00590','K00083','K03782','C02666']
plotOne = forRelatedness.loc[forRelatedness['KEGG'].isin(shortList)]
kData = plotOne.ix[:,dayList]
fig,ax = plt.subplots(1)
kData.T.plot(ax=ax, ylim = [0,5])
handles, labels = ax.get_legend_handles_labels()
#convert the RI numbers to COnumbers for the figure
for ia, a in enumerate(labels):
#add compound/gene name to the legend
##kegg_list('C00020').read()
#will have annoying tabs, use this to find them
if a[0]== 'R':
tLabel = convertRItoCO(CO_fromMATLAB,a)
fn = kegg_list(tLabel).read()
labels[ia] = fn
elif a[0] == 'K':
fn = kegg_list(a).read()
labels[ia] = fn
# ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
pdfName = 'PhenlypropanoidPathway2' + '.pdf'
mpl.rcParams['pdf.fonttype'] = 42
fig.savefig(pdfName)